Fri Aug 23 12:42:09 2024
Import the Seurat object: previously processed, annotated with cell labels, assigned gene-set scores using AUCell (cancersea state signature scores), and cells that contain EGFP plasmid fragments have been labeled.
seu <- readRDS(file.path(projDir, 'single_cell_analysis', 'seu.RDS'))
# simplify cell type labels (group rare ones into "other")
seu$celltype <- ifelse(seu$celltype %in% names(which(table(seu$celltype) < 100)),
'other', seu$celltype)
# update group labels
conditions <- list('PoolA' = 'Recovery',
'PoolB' = 'DSS',
'PoolC' = 'Untreated')
seu$condition <- paste(conditions[seu$sample])
egfp_cells <- colnames(seu)[seu$egfp_status == 'EGFP_POS']
From the single-cell sequencing of samples under 3 conditions, we managed to pick up about 1.5% of cells to contain at least one unique read originating from the EGFP-plasmid. Those cells are supposed to contain NFKB promoters thus, EGFP-positive cells are likely to contain increased NFKB activity.
knitr::kable(table(seu$egfp_status, seu$condition),
caption = 'Number of cells per condition categorized by EGFP status')
| DSS | Recovery | Untreated | |
|---|---|---|---|
| EGFP_NEG | 5257 | 7153 | 6561 |
| EGFP_POS | 76 | 116 | 116 |
Note: Black dots represent cells for which we could pick up at least one EGFP-plasmid fragment.
p <- DimPlot(seu, group.by = 'celltype')
df <- p[[1]]$data
p + geom_point(data = df[egfp_cells,], aes(x = umap_1, y = umap_2),
color = 'black', size = 1)
p <- DimPlot(seu, group.by = 'condition')
df <- p[[1]]$data
p + geom_point(data = df[egfp_cells,], aes(x = umap_1, y = umap_2),
color = 'black', size = 1)
p <- DimPlot(seu, group.by = 'celltype', split.by = 'condition')
df <- p[[1]]$data
p + geom_point(data = df[egfp_cells,], aes(x = umap_1, y = umap_2),
color = 'black', size = 1)
Comparison of cancer state signature scores in egfp+ and egfp- cells in different conditions
sigs <- colnames(seu@meta.data)[10:23]
dt <- data.table(seu@meta.data)
mdt <- melt.data.table(dt, measure.vars = sigs)
ggplot(mdt, aes(x = value, y = variable)) +
geom_density_ridges(aes(fill = egfp_status), alpha = 0.5) +
scale_fill_manual(values = c('blue', 'red')) +
labs(y = 'Signature', x = 'AUCell score')
ggplot(mdt, aes(x = value, y = variable)) +
geom_density_ridges(aes(fill = condition), alpha = 0.5) +
labs(y = 'Signature', x = 'AUCell score')
ggplot(mdt, aes(x = value, y = variable)) +
geom_density_ridges(aes(fill = egfp_status), alpha = 0.5) +
scale_fill_manual(values = c('blue', 'red')) +
labs(y = 'Signature', x = 'AUCell score') +
facet_grid(~ condition)
Now, we focus on epithelial cells (cells annotated as epithelial and also are cdh1+) to look for NKFB activity signatures by comparing EGFP+ cells iwth EGFP- cells in different conditions.
For each gene, we check ratio of epithelial cells expressing the marker in different conditions; split by egfp status
qPCR_markers <- c('Egfp', 'Lgr5', 'Chga', 'Muc2', 'Dclk1', 'Tnfaip3', 'Noxa1',
'Tnf', 'Bcl2l1', 'Bcl2')
M <- GetAssayData(seu)
epithelial_cells <- intersect(colnames(seu)[seu$celltype == 'Epithelial cells'],
names(which(M['Cdh1',] > 0)))
# setdiff(intersect(colnames(seu)[seu$celltype == 'Epithelial cells'],
# names(which(M['Cdh1',] > 0))),
# names(which(M['Ptprc',] > 0)))
seu_epi <- seu[,epithelial_cells]
dt <- data.table(seu_epi@meta.data, keep.rownames = T)
dt <- cbind(dt, sapply(qPCR_markers, function(x) {
M[x, dt$rn]
}))
df <- do.call(rbind, lapply(qPCR_markers, function(x) {
s <- dt[,list('percent_expressed' = round(sum(get(x) > 0)/length(get(x)) * 100, 1),
'avg_expression' = mean(get(x))),by = c('condition', 'egfp_status')]
s$gene <- x
return(s)
}))
df$egfp_status <- factor(df$egfp_status, levels = c('EGFP_POS', 'EGFP_NEG'))
df$condition <- factor(df$condition, levels = c('Untreated', 'DSS', 'Recovery'))
ggplot(df, aes(x = egfp_status, y = percent_expressed)) +
geom_bar(stat = 'identity', aes(fill = condition), position = 'dodge') +
facet_wrap(~ gene, scales = 'free', nrow = 2) +
scale_fill_brewer(type = 'qual', palette = 6)
# find markers for egfp+/egpf- in cdh1+ epithelial cells
markers <- sapply(simplify = F, unique(seu_epi$condition), function(x) {
message(date(), " => computing markers for condition ",x)
dt <- data.table(FindMarkers(seu_epi[,seu_epi$condition == x],
min.pct = 0.1,
logFoldChange = 0.25,
test.use = 'wilcox',
ident.1 = 'EGFP_POS',
ident.2 = 'EGFP_NEG',
only.pos = TRUE,
group.by = 'egfp_status'),
keep.rownames = T)
dt$condition <- x
return(dt)
})
markers <- do.call(rbind, markers)
m <- markers[pct.1 > 0.15][p_val_adj < 0.1][order(p_val_adj), .SD[1:21], by = .(condition)][!is.na(rn)][rn != 'Egfp']
p <- DotPlot(seu_epi[,seu_epi$egfp_status == 'EGFP_POS'],
features = unique(m$rn),
group.by = 'condition', scale = F) + coord_flip()
df <- p$data
df$id <- factor(df$id, levels = c('Untreated', 'DSS', 'Recovery'))
ggplot(df, aes(x = features.plot, y = id)) +
geom_point(aes(color = avg.exp.scaled, size = pct.exp)) +
labs(color = "Average\nExpression", size = "Percent\nExpression") +
# scale_color_gradient(low = '#672976', high = 'yellow') +
scale_color_gradientn(colors = colorRampPalette(c("#313e93", "#b252a2", "#f4a16f", "#eeea59"))(50)) +
scale_size(range = c(0, 6)) +
theme(axis.title = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, face = 'italic'),
axis.text = element_text(size = 14, family = 'Arial'),
legend.title = element_text(family = 'Arial'))
# subset of genes of interest
# I don't know how these are selected by Marina
goi <- c('Steap4', 'Smim19', 'Cox17', 'Nfkbib', 'Man2a2', 'Sept10', 'Acot13', 'Ndufv2', 'Mrps15', 'Aqp8', 'H2-Eb1', 'Ly6g', 'Atf3')
p <- DotPlot(seu_epi[,seu_epi$egfp_status == 'EGFP_POS'],
features = goi,
group.by = 'condition', scale = F) + coord_flip()
df <- p$data
df$id <- factor(df$id, levels = c('Untreated', 'DSS', 'Recovery'))
ggplot(df, aes(x = features.plot, y = id)) +
geom_point(aes(color = avg.exp.scaled, size = pct.exp)) +
labs(color = "Average\nExpression", size = "Percent\nExpression") +
scale_color_gradientn(colors = colorRampPalette(c("#313e93", "#b252a2", "#f4a16f", "#eeea59"))(50)) +
scale_size(range = c(0, 6)) +
theme(axis.title = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, face = 'italic'),
axis.text = element_text(size = 14, family = 'Arial'),
legend.title = element_text(family = 'Arial'))
Found top markers expressed in at least 15% of EGFP+ cells and filtered for padj < 0.1
colorPal <- colorRampPalette(c("darkorchid4", "yellow"))(50)
M <- GetAssayData(seu_epi)
#top <- markers[pct.1 > 0.15][p_val_adj < 0.1]
top <- markers[pct.1 > 0.15][p_val_adj < 0.1][order(p_val_adj), .SD[1:20], by = .(condition)][!is.na(rn)]
B <- get_basis_matrix(t(as.matrix(M[unique(top$rn),])),
as.factor(paste(seu_epi$condition, seu_epi$egfp_status)))
sample_order <- c('Untreated EGFP_POS', 'Untreated EGFP_NEG',
'DSS EGFP_POS', 'DSS EGFP_NEG',
'Recovery EGFP_POS', 'Recovery EGFP_NEG')
pheatmap::pheatmap(t(B[sample_order,]), scale = 'row', cluster_cols = F,
cellwidth = 20, fontsize_row = 9, color = colorPal,
gaps_col = c(2,4))
# import nfkb target genes; (for marker analysis )
nfkb <- readLines('/data/local/buyar/collaborations/marina_kol/data/nfkb_targets_mouse.txt')
dt <- melt.data.table(markers, id.vars = c('rn', 'condition'), measure.vars = c('pct.1', 'pct.2'))
dt$variable <- ifelse(dt$variable == 'pct.1', 'EGFP_POS', 'EGFP_NEG')
dt$group <- paste(dt$condition, dt$variable)
dtc <- dcast.data.table(dt, rn ~ group, value.var = 'value')
dtc[is.na(dtc)] <- 0
m <- as.matrix(data.frame(dtc[,-1], row.names = dtc[[1]], check.names = F))
genes <- unique(top$rn)
ann_row <- data.frame('condition' = markers[rn %in% genes, .SD[which.min(p_val)],by = rn][match(genes, rn)]$condition,
#'is_nfkb_target' = as.factor(genes %in% nfkb),
row.names = genes)
pheatmap::pheatmap(m[genes, sample_order], scale = 'row',
color = colorPal, cellwidth = 20,
main = 'Ratio of cells expressing top markers',
annotation_row = ann_row,
gaps_col = c(2, 4),
cluster_cols = F,
fontsize = 9, cutree_rows = 3)
We want to apply some things learned from the single-cell data to the bulk RNA-seq data we have analyzed before.
# get RUVSeq normalized counts from bulk rnaseq data
workdir <- '/data/local/buyar/collaborations/marina_acute_colitis/'
sample_sheet <- read.csv(file.path(workdir, 'data/sample_sheet.csv'))
counts <- read.table(file.path(workdir, '/output_ori/feature_counts/raw_counts/hisat2/counts.tsv'), header = T, check.names = F)
colData <- data.frame(sample_sheet[,-1], row.names = sample_sheet$name)
set <- newSeqExpressionSet(counts = as.matrix(counts),
phenoData = colData)
differences <- makeGroups(colData$sample_type)
set_s <- RUVs(set, unique(rownames(set)),
k=10, differences) #all genes
norm_counts <- log(normCounts(set_s)+1)
Make a heatmap of NFKB target genes (exclusively expressed in Cdh1+ epithelial cells only )
# subset norm_counts for nfkb genes
ids <- readRDS('/data/local/buyar/datasets/ensmusg2name.RDS')
nfkb_ids <- ids[match(nfkb, name)]$id
M_nfkb <- norm_counts[nfkb_ids,]
# convert to gene names
rownames(M_nfkb) <- ids[match(rownames(M_nfkb), id)]$name
M <- GetAssayData(seu_epi)
nonepi_cells <- colnames(seu)[seu$celltype != 'Epithelial cells']
M_nonepi <- GetAssayData(seu[,nonepi_cells])
genes <- intersect(rownames(M), rownames(M_nfkb))
# get genes expressed in at least 1% of epithelial cells
genes <- names(which(apply(M[genes,], 1, function(x) (sum(x > 0) / length(x)) * 100) > 0.5))
# remove genes expressed in more than 5% of the non epithelial cells
genes <- names(which(apply(M_nonepi[genes,], 1, function(x) (sum(x > 0) / length(x)) * 100) < 3))
# keep activated / repressed samples only
genes <- names(which(rowSums(M_nfkb[genes,]) > 0))
colData$nfkb <- gsub(".+_", "", colData$sample_type)
colData$dss <- gsub("_.+", "", colData$sample_type)
pheatmap::pheatmap(M_nfkb[genes, ], scale = 'row',
annotation_col = colData[,c('nfkb','dss'),drop=F],
cluster_cols = F,
cellwidth = 15,
color = rev(colorRampPalette(RColorBrewer::brewer.pal(11, "RdYlBu"))(10)),
gaps_col = c(8, 16),
cutree_rows = 5,
show_colnames = F)
We score bulk rnaseq samples using different gene signatures.
score_gene_sets <- function(exprs, genesets) {
rankData <- singscore::rankGenes(exprs)
s <- pbapply::pbsapply(genesets, function(x) {
x <- intersect(x, rownames(rankData))
singscore::simpleScore(rankData, upSet = x)[['TotalScore']]
})
rownames(s) <- colnames(exprs)
return(s)
}
find_differential_signatures <- function(scores, samplesA, samplesB) {
groupA <- intersect(colnames(scores), samplesA)
groupB <- intersect(colnames(scores), samplesB)
message("Comparing ",length(groupA), " samples (A) with ",length(groupB), " samples (B)")
stats <- do.call(rbind, lapply(rownames(scores), function(x) {
a <- scores[x, groupA]
b <- scores[x, groupB]
data.table('signature' = x, 'groupA' = mean(a),
'groupB' = mean(b),
'pval' = wilcox.test(a, b)[['p.value']])
}))
stats$padj <- p.adjust(stats$pval, method = 'BH')
stats <- stats[order(pval)]
return(stats)
}
gex <- data.frame(norm_counts, check.names = F)
# convert to gene names
ids <- readRDS('/data/local/buyar/datasets/ensmusg2name.RDS')
gex$name <- ids[match(rownames(gex), id)]$name
gex <- gex[!is.na(gex$name),]
gex <- gex[!duplicated(gex$name),]
rownames(gex) <- gex$name
gex$name <- NULL
For each condition, we found the signature genes that distinguish egfp+ from egfp- cells. Now, we score the bulk samples by these signatures.
m <- markers[pct.1 > 0.15][p_val_adj < 0.1][order(p_val_adj), .SD[1:51], by = .(condition)][rn != 'Egfp']
egfp_signatures <- sapply(simplify = F, unique(m$condition), function(x) m[condition == x]$rn)
egfp_signature_scores <- score_gene_sets(gex, egfp_signatures)
egfp_signature_scores <- egfp_signature_scores[,c('Untreated', 'DSS', 'Recovery')]
pheatmap::pheatmap(t(egfp_signature_scores), cluster_cols = F,
scale = 'row',
annotation_col = colData[,c('nfkb','dss'),drop=F],
cellwidth = 15, cellheight = 10,
color = colorRampPalette(c("#6298c8", "white", "#ae3d7f"))(10), cluster_rows = F,
show_colnames = F,
gaps_col = c(8, 16),
main = 'EGFP+ condition signature scores in bulk rnaseq')
xcell <- readRDS('/data/local/buyar/datasets/xCell_signatures.mouse_genenames.RDS')
# immune cell groups from xcell
immune_cells <- c("B-cells", "CD4+ T-cells",
"CD8+ T-cells", "DC", "Eosinophils", "Macrophages", "Monocytes",
"Mast cells", "Neutrophils", "NK cells")
xcell_scores <- score_gene_sets(gex, xcell[immune_cells])
pheatmap::pheatmap(t(xcell_scores), scale = 'row',
annotation_col = colData[,c('nfkb','dss'),drop=F],
cellwidth = 15, cellheight = 10,
show_colnames = F,
cluster_cols = F,
gaps_col = c(8, 16),
color = colorRampPalette(c("#6298c8", "white", "#ae3d7f"))(10),
main = 'Immune Cell Type Signature Scores in Bulk RNAseq')
print(sessionInfo())
## R version 4.2.3 (2023-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] Seurat_5.0.3 SeuratObject_5.0.1
## [3] sp_2.1-4 RUVSeq_1.32.0
## [5] edgeR_3.40.2 limma_3.54.2
## [7] EDASeq_2.32.0 ShortRead_1.56.1
## [9] GenomicAlignments_1.34.1 SummarizedExperiment_1.28.0
## [11] MatrixGenerics_1.10.0 matrixStats_1.1.0
## [13] Rsamtools_2.14.0 GenomicRanges_1.50.2
## [15] Biostrings_2.66.0 GenomeInfoDb_1.34.9
## [17] XVector_0.38.0 IRanges_2.32.0
## [19] S4Vectors_0.36.2 BiocParallel_1.32.6
## [21] Biobase_2.58.0 BiocGenerics_0.44.0
## [23] ggridges_0.5.6 ggrepel_0.9.4
## [25] stringr_1.5.0 gprofiler2_0.2.2
## [27] ggpubr_0.6.0 ggplot2_3.4.2
## [29] pheatmap_1.0.12 data.table_1.14.8
## [31] openxlsx_4.2.5.2
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.3 spatstat.explore_3.2-7 reticulate_1.36.1
## [4] R.utils_2.12.3 tidyselect_1.2.0 RSQLite_2.3.1
## [7] AnnotationDbi_1.60.2 htmlwidgets_1.6.2 grid_4.2.3
## [10] Rtsne_0.17 munsell_0.5.0 codetools_0.2-19
## [13] ica_1.0-3 interp_1.1-6 future_1.33.2
## [16] miniUI_0.1.1.1 withr_3.0.0 spatstat.random_3.2-3
## [19] colorspace_2.1-0 progressr_0.14.0 filelock_1.0.3
## [22] highr_0.10 knitr_1.43 rstudioapi_0.15.0
## [25] ROCR_1.0-11 tensor_1.5 ggsignif_0.6.4
## [28] listenv_0.9.1 labeling_0.4.2 GenomeInfoDbData_1.2.9
## [31] polyclip_1.10-6 hwriter_1.3.2.1 farver_2.1.1
## [34] bit64_4.0.5 parallelly_1.37.1 vctrs_0.6.3
## [37] generics_0.1.3 xfun_0.39 BiocFileCache_2.6.1
## [40] R6_2.5.1 locfit_1.5-9.8 spatstat.utils_3.0-4
## [43] bitops_1.0-7 cachem_1.0.8 DelayedArray_0.24.0
## [46] promises_1.2.0.1 BiocIO_1.8.0 scales_1.2.1
## [49] gtable_0.3.3 globals_0.16.3 goftest_1.2-3
## [52] spam_2.10-0 rlang_1.1.3 splines_4.2.3
## [55] rtracklayer_1.58.0 rstatix_0.7.2 lazyeval_0.2.2
## [58] spatstat.geom_3.2-9 broom_1.0.5 reshape2_1.4.4
## [61] yaml_2.3.7 abind_1.4-5 GenomicFeatures_1.50.4
## [64] backports_1.4.1 httpuv_1.6.12 tools_4.2.3
## [67] ellipsis_0.3.2 jquerylib_0.1.4 RColorBrewer_1.1-3
## [70] plyr_1.8.8 Rcpp_1.0.10 progress_1.2.2
## [73] zlibbioc_1.44.0 purrr_1.0.1 RCurl_1.98-1.14
## [76] prettyunits_1.1.1 deldir_2.0-4 pbapply_1.7-2
## [79] cowplot_1.1.1 zoo_1.8-12 cluster_2.1.4
## [82] magrittr_2.0.3 scattermore_1.2 RSpectra_0.16-1
## [85] lmtest_0.9-40 RANN_2.6.1 fitdistrplus_1.1-11
## [88] aroma.light_3.28.0 patchwork_1.2.0 xtable_1.8-4
## [91] hms_1.1.3 mime_0.12 evaluate_0.21
## [94] XML_3.99-0.14 jpeg_0.1-10 gridExtra_2.3
## [97] fastDummies_1.7.3 compiler_4.2.3 biomaRt_2.54.1
## [100] tibble_3.2.1 KernSmooth_2.23-20 crayon_1.5.2
## [103] R.oo_1.26.0 htmltools_0.5.5 later_1.3.1
## [106] tidyr_1.3.0 DBI_1.1.3 dbplyr_2.3.2
## [109] MASS_7.3-58.2 rappdirs_0.3.3 Matrix_1.6-3
## [112] car_3.1-2 cli_3.6.2 R.methodsS3_1.8.2
## [115] parallel_4.2.3 dotCall64_1.1-1 igraph_2.0.3
## [118] pkgconfig_2.0.3 spatstat.sparse_3.0-3 plotly_4.10.2
## [121] xml2_1.3.4 annotate_1.76.0 bslib_0.5.0
## [124] digest_0.6.33 sctransform_0.4.1 RcppAnnoy_0.0.22
## [127] graph_1.76.0 spatstat.data_3.0-4 rmarkdown_2.23
## [130] leiden_0.4.3.1 uwot_0.2.2 GSEABase_1.60.0
## [133] restfulr_0.0.15 curl_5.0.1 shiny_1.7.5.1
## [136] rjson_0.2.21 nlme_3.1-162 lifecycle_1.0.3
## [139] jsonlite_1.8.8 carData_3.0-5 viridisLite_0.4.2
## [142] fansi_1.0.4 pillar_1.9.0 lattice_0.20-45
## [145] KEGGREST_1.38.0 fastmap_1.1.1 httr_1.4.6
## [148] survival_3.5-3 glue_1.6.2 zip_2.3.0
## [151] png_0.1-8 bit_4.0.5 presto_1.0.0
## [154] stringi_1.7.12 sass_0.4.6 blob_1.2.4
## [157] singscore_1.18.0 RcppHNSW_0.6.0 latticeExtra_0.6-30
## [160] memoise_2.0.1 dplyr_1.1.2 irlba_2.3.5.1
## [163] future.apply_1.11.2